In [1]:
# Logistic Regression on Titanic Dataset
# Goal: Predict survival on the Titanic using logistic regression.
# Broad Steps to Solve the Problem:
# 1. Import necessary libraries
# 2. Load the dataset
# 3. Explore the dataset (to see if there are any missing values, outliers, etc., and also to check the relationship between the features and the target variable)
# 4. Encoding categorical variables (if any)
# 5. Separate the independent variables (features) to X and dependent variable (target) to y
# 6. Split the dataset into training and testing sets
# 7. Scale the independent variables (if necessary). You will not scale the dependent variable.
# 8. Modeling - apply the Logistic Regression algorithm or any other classification algorithm of your choice
# 9. Make predictions on the test set
# 10. Evaluate the model performance using appropriate classification metrics (e.g., accuracy, precision, recall, F1-score, ROC-AUC)
# 11. Visualize the results (if necessary)
# 12. Save the model (if necessary)
# 13. Document the findings and conclusions
# 14. Share the results with stakeholders (if necessary)
# 15. Deploy the model (if necessary)
In [2]:
# Lets do the necessary imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
In [3]:
# Load the Titanic passenger data from the local CSV into a DataFrame
# and preview the first five rows.
df = pd.read_csv('titanic.csv')
df.head()
Out[3]:
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
In [4]:
df.shape
Out[4]:
(891, 12)
In [5]:
# Note:
# Our data set was already explored during our EDA class. So by default, it is expected that you explore every project before coming for any conclusion. Since we have already explored and we remember the findings, I will be doing minimal EDA and straightaway jumping into modelling.
# However, for any real life use case you are strongly recommended to focus more on EDA to extract the relevant features and get the insights from the data before proceeding to modelling.
In [6]:
# Count missing values per column, most-missing first.
df.isna().sum().sort_values(ascending=False)
Out[6]:
Cabin 687 Age 177 Embarked 2 PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 SibSp 0 Parch 0 Ticket 0 Fare 0 dtype: int64
In [7]:
# Cabin is missing for 687 of 891 rows (~77%) — far too sparse to impute,
# so we drop the column entirely.
# Reassigning (instead of inplace=True) keeps the data lineage explicit and
# is the pandas-recommended style.
df = df.drop(columns='Cabin')
df.isna().sum().sort_values(ascending=False)
Out[7]:
Age 177 Embarked 2 PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 SibSp 0 Parch 0 Ticket 0 Fare 0 dtype: int64
In [8]:
df.Embarked.value_counts()
Out[8]:
Embarked S 644 C 168 Q 77 Name: count, dtype: int64
In [9]:
# S (Southampton) is by far the most common port (644/891), so impute the
# two missing Embarked values with the mode 'S'.
# Note: column-level `df['col'].fillna(..., inplace=True)` is the deprecated
# chained-assignment pattern (it warns in pandas 2.x and silently no-ops
# under copy-on-write) — assign the result back instead.
df['Embarked'] = df['Embarked'].fillna('S')
df.isna().sum().sort_values(ascending=False)
Out[9]:
Age 177 PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Embarked 0 dtype: int64
In [10]:
df.Age.skew()
Out[10]:
0.38910778230082704
In [11]:
# If this value is between:
# ==> -0.5 and 0.5, the distribution of the value is almost symmetrical
# ==> -1 and -0.5, the data is negatively skewed, and if it is between 0.5 to 1, the data is positively skewed. The skewness is moderate.
# ==> If the skewness is lower than -1 (negatively skewed) or greater than 1 (positively skewed), the data is highly skewed.
In [12]:
# Age skewness is ~0.39 (between -0.5 and 0.5), i.e. roughly symmetric,
# so the mean is a reasonable imputation value for the 177 missing ages.
# Assign the result back instead of `fillna(..., inplace=True)` on a column:
# that chained pattern is deprecated in pandas 2.x and silently no-ops
# under copy-on-write.
df['Age'] = df['Age'].fillna(df['Age'].mean())
# Now lets check if there are any missing values left.
df.isna().sum().sort_values(ascending=False)
Out[12]:
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Embarked 0 dtype: int64
In [13]:
df.head()
Out[13]:
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S |
In [14]:
# Feature Engineering
# A passenger's family size aboard = siblings/spouses + parents/children + self.
df['FamilySize'] = df[['SibSp', 'Parch']].sum(axis=1) + 1
df.head()
Out[14]:
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | FamilySize | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S | 2 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C | 2 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | 1 |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S | 2 |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S | 1 |
In [15]:
# Flag solo travellers: 1 when the passenger has no family aboard
# (FamilySize == 1), otherwise 0.
df['isAlone'] = (df['FamilySize'] == 1).astype(int)
df.head()
Out[15]:
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | FamilySize | isAlone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S | 2 | 0 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C | 2 | 0 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | 1 | 1 |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S | 2 | 0 |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S | 1 | 1 |
In [16]:
# GenderClass
# If Age < 15, then 'child'; else the passenger's Sex (male/female).
# Vectorized with np.where instead of a row-wise df.apply(axis=1):
# identical result, and avoids the slow per-row Python lambda.
df['GenderClass'] = np.where(df['Age'] < 15, 'child', df['Sex'])
df.head()
Out[16]:
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | FamilySize | isAlone | GenderClass | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S | 2 | 0 | male |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C | 2 | 0 | female |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | 1 | 1 | female |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S | 2 | 0 | female |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S | 1 | 1 | male |
In [17]:
df.GenderClass.value_counts()
Out[17]:
GenderClass male 538 female 275 child 78 Name: count, dtype: int64
In [18]:
# Title Extraction
# Pull the honorific between the comma and the first period in each name,
# e.g. "Braund, Mr. Owen Harris" -> "Mr".
df['Title'] = (
    df['Name']
    .str.split(',').str[1]
    .str.split('.').str[0]
    .str.strip()
)
df.head()
Out[18]:
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | FamilySize | isAlone | GenderClass | Title | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S | 2 | 0 | male | Mr |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C | 2 | 0 | female | Mrs |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | 1 | 1 | female | Miss |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S | 2 | 0 | female | Mrs |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S | 1 | 1 | male | Mr |
In [19]:
df.Title.value_counts()
Out[19]:
Title Mr 517 Miss 182 Mrs 125 Master 40 Dr 7 Rev 6 Mlle 2 Major 2 Col 2 the Countess 1 Capt 1 Ms 1 Sir 1 Lady 1 Mme 1 Don 1 Jonkheer 1 Name: count, dtype: int64
In [20]:
# Lets group the titles into broader categories
# Grouping titles into broader categories (Mr / Mrs / Miss / Master).
title_mapping = {
    'Mr': 'Mr',
    'Mrs': 'Mrs',
    'Miss': 'Miss',
    'Master': 'Master',
    'Don': 'Mr',
    'Rev': 'Mr',
    'Dr': 'Mr',
    'Mme': 'Mrs',
    'Ms': 'Miss',
    'Major': 'Mr',
    'Lady': 'Mrs',
    'Sir': 'Mr',
    'Mlle': 'Miss',
    'Col': 'Mr',
    'Capt': 'Mr',
    'the Countess': 'Mrs',
    'Jonkheer': 'Mr'
}
# .map() returns NaN for any title not in the mapping; fall back to 'Mr'
# (the catch-all already used for rare titles above) so an unseen title
# can never silently introduce missing values downstream.
df['Title'] = df['Title'].map(title_mapping).fillna('Mr')
# Now lets check the value counts of the titles
df.Title.value_counts()
Out[20]:
Title Mr 538 Miss 185 Mrs 128 Master 40 Name: count, dtype: int64
In [21]:
df.head()
Out[21]:
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | FamilySize | isAlone | GenderClass | Title | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S | 2 | 0 | male | Mr |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C | 2 | 0 | female | Mrs |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | 1 | 1 | female | Miss |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S | 2 | 0 | female | Mrs |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S | 1 | 1 | male | Mr |
In [22]:
# Remove identifiers and raw columns that have been superseded by the
# engineered features (FamilySize, isAlone, GenderClass, Title).
cols_to_drop = ['PassengerId', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket']
df = df.drop(columns=cols_to_drop)
df.head()
Out[22]:
| Survived | Pclass | Age | Fare | Embarked | FamilySize | isAlone | GenderClass | Title | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 22.0 | 7.2500 | S | 2 | 0 | male | Mr |
| 1 | 1 | 1 | 38.0 | 71.2833 | C | 2 | 0 | female | Mrs |
| 2 | 1 | 3 | 26.0 | 7.9250 | S | 1 | 1 | female | Miss |
| 3 | 1 | 1 | 35.0 | 53.1000 | S | 2 | 0 | female | Mrs |
| 4 | 0 | 3 | 35.0 | 8.0500 | S | 1 | 1 | male | Mr |
In [23]:
# One-hot encoding / Dummification of columns: Embarked, GenderClass, Title
# drop_first=True drops one level per column to avoid the dummy-variable trap;
# dtype=int yields 0/1 integer columns instead of booleans.
df = pd.get_dummies(df, columns=['Embarked', 'GenderClass', 'Title'], drop_first=True, dtype=int)
df.head()
Out[23]:
| Survived | Pclass | Age | Fare | FamilySize | isAlone | Embarked_Q | Embarked_S | GenderClass_female | GenderClass_male | Title_Miss | Title_Mr | Title_Mrs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 22.0 | 7.2500 | 2 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
| 1 | 1 | 1 | 38.0 | 71.2833 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 2 | 1 | 3 | 26.0 | 7.9250 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| 3 | 1 | 1 | 35.0 | 53.1000 | 2 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 4 | 0 | 3 | 35.0 | 8.0500 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
In [24]:
# Split the data into features and target variable
# X: all encoded/engineered predictors; y: the binary survival label (0/1).
X = df.drop('Survived', axis=1)
y = df['Survived']
In [25]:
# Train-test split
from sklearn.model_selection import train_test_split
# stratify=y keeps the survived/died ratio identical in train and test;
# random_state=0 makes the split reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
# Print the shapes of the train and test sets
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
X_train shape: (712, 12), y_train shape: (712,) X_test shape: (179, 12), y_test shape: (179,)
In [26]:
X_train.head()
Out[26]:
| Pclass | Age | Fare | FamilySize | isAlone | Embarked_Q | Embarked_S | GenderClass_female | GenderClass_male | Title_Miss | Title_Mr | Title_Mrs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 502 | 3 | 29.699118 | 7.6292 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 464 | 3 | 29.699118 | 8.0500 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
| 198 | 3 | 29.699118 | 7.7500 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 765 | 1 | 51.000000 | 77.9583 | 2 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 421 | 3 | 21.000000 | 7.7333 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
In [27]:
# Only the continuous features need standardization; the 0/1 dummies and
# small-integer columns are left as-is.
features_to_be_scaled = ['Age', 'Fare']
# Scaling the features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit only on the training split (no test-set leakage), then apply the same
# transform to the test split. Assigning through .loc avoids the
# SettingWithCopyWarning that plain column assignment on a split frame can
# raise (and which the global warnings filter would hide).
X_train.loc[:, features_to_be_scaled] = sc.fit_transform(X_train[features_to_be_scaled])
X_test.loc[:, features_to_be_scaled] = sc.transform(X_test[features_to_be_scaled])
In [28]:
X_train.head()
Out[28]:
| Pclass | Age | Fare | FamilySize | isAlone | Embarked_Q | Embarked_S | GenderClass_female | GenderClass_male | Title_Miss | Title_Mr | Title_Mrs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 502 | 3 | 0.014210 | -0.482061 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 464 | 3 | 0.014210 | -0.474085 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
| 198 | 3 | 0.014210 | -0.479771 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 765 | 1 | 1.674826 | 0.850885 | 2 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 421 | 3 | -0.663973 | -0.480088 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
In [29]:
# Modeling - apply the Logistic Regression algorithm
from sklearn.linear_model import LogisticRegression

# max_iter raised from the default 100: the lbfgs solver frequently fails to
# converge within 100 iterations on scaled-but-mixed feature sets, and the
# resulting ConvergenceWarning would be invisible because warnings are
# globally suppressed at the top of this notebook.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Predicting on training and test sets
y_pred_train = logreg.predict(X_train)
y_pred_test = logreg.predict(X_test)

# Evaluating the model performance
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Comparing train vs test accuracy also indicates whether the model overfits.
train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)
print(f"Training Accuracy: {train_acc}")
print(f"Testing Accuracy: {test_acc}")
Training Accuracy: 0.8328651685393258 Testing Accuracy: 0.8156424581005587
| Aspect | Observation |
|---|---|
| Accuracy Score | Both training and testing accuracies are above 80%, which is strong baseline performance for this dataset. |
| No Signs of Overfitting | Since training and testing scores are close, model does not overfit. |
| Suitable for Deployment | Given the balanced accuracy and simplicity of logistic regression, this model can serve as a baseline classifier. |
| Potential Next Steps | Consider adding more features, trying regularization (C parameter), or comparing with other classifiers (e.g., Random Forest, XGBoost) to improve performance. |
In [30]:
confusion_matrix(y_test, y_pred_test)
Out[30]:
array([[98, 12],
[21, 48]])
In [34]:
print(classification_report(y_test, y_pred_test))
precision recall f1-score support
0 0.82 0.89 0.86 110
1 0.80 0.70 0.74 69
accuracy 0.82 179
macro avg 0.81 0.79 0.80 179
weighted avg 0.81 0.82 0.81 179
In [31]:
# Wrap the raw confusion matrix in a labeled DataFrame so rows read as the
# true outcome and columns as the predicted outcome.
confusion_mat = confusion_matrix(y_test, y_pred_test)
cm_df = pd.DataFrame(
    confusion_mat,
    index=['Actually Died', 'Actually Survived'],
    columns=['Predicted Died', 'Predicted Survived'],
)
cm_df
Out[31]:
| Predicted Died | Predicted Survived | |
|---|---|---|
| Actually Died | 98 | 12 |
| Actually Survived | 21 | 48 |
Confusion Matrix:¶
| Predicted Died | Predicted Survived | |
|---|---|---|
| Actually Died | 98 (True Negatives) | 12 (False Positives) |
| Actually Survived | 21 (False Negatives) | 48 (True Positives) |
Key Metrics and Interpretations:¶
Accuracy:
$$ \text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN} = \frac{48 + 98}{48 + 98 + 12 + 21} = \frac{146}{179} \approx 81.56\% $$

- The model correctly predicts the outcome about 81.56% of the time.

Precision (Survived):
$$ \text{Precision} = \frac{TP}{TP + FP} = \frac{48}{48 + 12} = \frac{48}{60} = 80\% $$

- When the model predicts someone will survive, it is correct 80% of the time.

Recall (Survived):
$$ \text{Recall} = \frac{TP}{TP + FN} = \frac{48}{48 + 21} = \frac{48}{69} \approx 69.57\% $$

- The model identifies about 69.57% of the actual survivors.

Precision (Died):
$$ \text{Precision} = \frac{TN}{TN + FN} = \frac{98}{98 + 21} = \frac{98}{119} \approx 82.35\% $$

Recall (Died):
$$ \text{Recall} = \frac{TN}{TN + FP} = \frac{98}{98 + 12} = \frac{98}{110} \approx 89.09\% $$
Insights:¶
- The model is more accurate at predicting deaths than survivals, with better recall and precision on the "Died" class.
- It struggles more with false negatives (21 people who survived but were predicted to die), which is critical in survival scenarios.
- Balanced performance: While not perfect, both classes are reasonably well predicted, indicating the model is not heavily biased toward one outcome.
In [ ]:
# Calculate classification report
# output_dict=True returns the report as a nested dict so it can be turned
# into a DataFrame (transposed: one row per class / average).
class_report = classification_report(y_test, y_pred_test, output_dict=True)
class_report_df = pd.DataFrame(class_report).transpose()
class_report_df
# f1 score is the harmonic mean of precision and recall.
# It is a measure of a model's accuracy that considers both the precision and the recall.
# It is particularly useful when the class distribution is imbalanced.
# f1 score formula:
# https://images.prismic.io/encord/0ef9c82f-2857-446e-918d-5f654b9d9133_Screenshot+%2849%29.png?auto=compress,format
Out[ ]:
| precision | recall | f1-score | support | |
|---|---|---|---|---|
| 0 | 0.823529 | 0.890909 | 0.855895 | 110.000000 |
| 1 | 0.800000 | 0.695652 | 0.744186 | 69.000000 |
| accuracy | 0.815642 | 0.815642 | 0.815642 | 0.815642 |
| macro avg | 0.811765 | 0.793281 | 0.800041 | 179.000000 |
| weighted avg | 0.814459 | 0.815642 | 0.812834 | 179.000000 |
| Metric | Class | Value | Interpretation |
|---|---|---|---|
| Precision | 0 | 0.8235 | 82.35% of predicted non-survivors were actually non-survivors |
| Precision | 1 | 0.8000 | 80.00% of predicted survivors were actually survivors |
| Recall | 0 | 0.8909 | Model identified 89.09% of actual non-survivors correctly |
| Recall | 1 | 0.6957 | Model identified 69.57% of actual survivors correctly |
| F1-Score | 0 | 0.8559 | Balanced performance for non-survivor prediction |
| F1-Score | 1 | 0.7442 | Balanced performance for survivor prediction |
| Support | 0 | 110 | Number of actual non-survivors in the test set |
| Support | 1 | 69 | Number of actual survivors in the test set |
Overall Evaluation Metrics¶
| Metric | Value | Interpretation |
|---|---|---|
| Accuracy | 0.8156 | Overall, 81.56% of total predictions are correct |
| Macro Avg F1 | 0.8000 | Average F1-score treating both classes equally (no weighting) |
| Weighted Avg F1 | 0.8128 | Average F1-score considering class imbalance (more realistic) |
| Macro Avg Recall | 0.7933 | Average recall of both classes, unweighted |
| Weighted Avg Precision | 0.8145 | Overall precision adjusted for class sizes |
In [36]:
logreg.predict(X_test)
Out[36]:
array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0])
In [ ]:
logreg.predict_proba(X_test)[:, 1]
In [37]:
logreg.predict_proba(X_test)
Out[37]:
array([[9.67908071e-01, 3.20919292e-02],
[9.32490994e-01, 6.75090064e-02],
[8.65873955e-01, 1.34126045e-01],
[9.23976595e-01, 7.60234054e-02],
[3.84356070e-02, 9.61564393e-01],
[6.20498119e-01, 3.79501881e-01],
[9.03077394e-01, 9.69226056e-02],
[9.13470087e-01, 8.65299135e-02],
[3.39534138e-01, 6.60465862e-01],
[9.06617655e-01, 9.33823455e-02],
[9.12827333e-01, 8.71726670e-02],
[5.96201798e-01, 4.03798202e-01],
[2.75296948e-01, 7.24703052e-01],
[9.27400014e-01, 7.25999856e-02],
[9.12827333e-01, 8.71726670e-02],
[5.21113456e-02, 9.47888654e-01],
[6.06922862e-01, 3.93077138e-01],
[9.01256533e-01, 9.87434674e-02],
[2.60674516e-01, 7.39325484e-01],
[4.01662220e-01, 5.98337780e-01],
[9.31276462e-01, 6.87235384e-02],
[8.58004847e-01, 1.41995153e-01],
[6.99629270e-01, 3.00370730e-01],
[2.56303571e-01, 7.43696429e-01],
[7.05838749e-01, 2.94161251e-01],
[2.43295899e-01, 7.56704101e-01],
[9.51929778e-01, 4.80702216e-02],
[6.95653752e-01, 3.04346248e-01],
[9.24910963e-01, 7.50890366e-02],
[9.77069011e-01, 2.29309894e-02],
[5.43012866e-01, 4.56987134e-01],
[9.27400014e-01, 7.25999856e-02],
[5.47293715e-02, 9.45270628e-01],
[8.88056804e-01, 1.11943196e-01],
[3.98296262e-02, 9.60170374e-01],
[9.60492921e-01, 3.95070794e-02],
[9.13035201e-01, 8.69647987e-02],
[3.31748978e-01, 6.68251022e-01],
[1.37269217e-01, 8.62730783e-01],
[2.86207326e-01, 7.13792674e-01],
[8.08235733e-01, 1.91764267e-01],
[3.67540277e-01, 6.32459723e-01],
[5.80394893e-01, 4.19605107e-01],
[9.68583494e-01, 3.14165058e-02],
[8.46766468e-01, 1.53233532e-01],
[9.19318807e-01, 8.06811931e-02],
[1.54176510e-01, 8.45823490e-01],
[5.22740271e-02, 9.47725973e-01],
[9.56720169e-01, 4.32798306e-02],
[9.34672512e-01, 6.53274882e-02],
[9.15628594e-01, 8.43714058e-02],
[7.01049150e-01, 2.98950850e-01],
[9.08562354e-01, 9.14376463e-02],
[4.67207689e-01, 5.32792311e-01],
[7.87947144e-01, 2.12052856e-01],
[9.52242959e-01, 4.77570410e-02],
[9.40814439e-01, 5.91855614e-02],
[9.10074330e-01, 8.99256701e-02],
[4.04961059e-02, 9.59503894e-01],
[7.49863927e-02, 9.25013607e-01],
[7.65781503e-01, 2.34218497e-01],
[9.61099669e-01, 3.89003311e-02],
[3.44666821e-01, 6.55333179e-01],
[3.62929317e-01, 6.37070683e-01],
[9.25473994e-01, 7.45260063e-02],
[9.04530122e-01, 9.54698783e-02],
[6.33993386e-01, 3.66006614e-01],
[3.79729654e-01, 6.20270346e-01],
[3.86022571e-01, 6.13977429e-01],
[7.87947144e-01, 2.12052856e-01],
[7.19645365e-01, 2.80354635e-01],
[3.41635048e-01, 6.58364952e-01],
[3.45296195e-01, 6.54703805e-01],
[9.46015592e-01, 5.39844076e-02],
[5.47787202e-01, 4.52212798e-01],
[9.27570408e-01, 7.24295916e-02],
[9.09837005e-01, 9.01629947e-02],
[2.89837380e-02, 9.71016262e-01],
[3.41646077e-01, 6.58353923e-01],
[3.06634964e-01, 6.93365036e-01],
[6.34962408e-01, 3.65037592e-01],
[9.10942451e-01, 8.90575495e-02],
[5.73434945e-01, 4.26565055e-01],
[9.03001987e-01, 9.69980135e-02],
[9.53287912e-01, 4.67120879e-02],
[7.21847153e-02, 9.27815285e-01],
[9.18875604e-01, 8.11243955e-02],
[7.30602642e-01, 2.69397358e-01],
[4.79612417e-02, 9.52038758e-01],
[4.65091905e-01, 5.34908095e-01],
[5.75160632e-01, 4.24839368e-01],
[8.10918085e-02, 9.18908192e-01],
[4.91193360e-02, 9.50880664e-01],
[9.27400014e-01, 7.25999856e-02],
[4.88153074e-01, 5.11846926e-01],
[7.70340183e-01, 2.29659817e-01],
[3.45746107e-01, 6.54253893e-01],
[9.83302731e-01, 1.66972688e-02],
[9.05865105e-01, 9.41348952e-02],
[9.15370571e-01, 8.46294293e-02],
[3.41601959e-01, 6.58398041e-01],
[8.75703793e-01, 1.24296207e-01],
[4.63724767e-01, 5.36275233e-01],
[5.65473459e-01, 4.34526541e-01],
[3.15970295e-01, 6.84029705e-01],
[4.41414738e-02, 9.55858526e-01],
[4.74809434e-01, 5.25190566e-01],
[9.48474764e-01, 5.15252363e-02],
[9.19554599e-01, 8.04454011e-02],
[5.22327426e-02, 9.47767257e-01],
[8.86226334e-01, 1.13773666e-01],
[8.22740986e-01, 1.77259014e-01],
[8.28536898e-01, 1.71463102e-01],
[9.04343632e-01, 9.56563677e-02],
[6.86586837e-01, 3.13413163e-01],
[1.75278591e-01, 8.24721409e-01],
[7.96402273e-01, 2.03597727e-01],
[5.80994262e-01, 4.19005738e-01],
[9.14990373e-01, 8.50096273e-02],
[6.07040781e-01, 3.92959219e-01],
[9.22394910e-01, 7.76050901e-02],
[5.87895097e-02, 9.41210490e-01],
[9.13082463e-01, 8.69175374e-02],
[9.62333355e-01, 3.76666451e-02],
[8.88346320e-01, 1.11653680e-01],
[2.96837651e-01, 7.03162349e-01],
[8.59619363e-01, 1.40380637e-01],
[7.92191479e-01, 2.07808521e-01],
[9.27916730e-01, 7.20832698e-02],
[1.81208719e-01, 8.18791281e-01],
[9.34433617e-01, 6.55663826e-02],
[9.06617655e-01, 9.33823455e-02],
[3.41521054e-01, 6.58478946e-01],
[5.71424359e-01, 4.28575641e-01],
[8.16337740e-01, 1.83662260e-01],
[4.66930075e-01, 5.33069925e-01],
[9.15628594e-01, 8.43714058e-02],
[4.90409243e-01, 5.09590757e-01],
[7.95418207e-01, 2.04581793e-01],
[9.17066861e-01, 8.29331394e-02],
[9.83302731e-01, 1.66972688e-02],
[9.20732227e-01, 7.92677730e-02],
[9.14236159e-01, 8.57638414e-02],
[9.27359274e-01, 7.26407259e-02],
[5.51438379e-02, 9.44856162e-01],
[9.31227284e-01, 6.87727155e-02],
[9.49415192e-01, 5.05848081e-02],
[9.14977678e-01, 8.50223222e-02],
[9.26647919e-01, 7.33520813e-02],
[9.01649548e-01, 9.83504519e-02],
[8.71488763e-01, 1.28511237e-01],
[9.22530372e-01, 7.74696280e-02],
[4.87769795e-02, 9.51223021e-01],
[4.94119687e-01, 5.05880313e-01],
[5.02006072e-02, 9.49799393e-01],
[5.18099117e-02, 9.48190088e-01],
[2.78587130e-01, 7.21412870e-01],
[8.94569106e-01, 1.05430894e-01],
[9.86025730e-01, 1.39742697e-02],
[6.34084414e-01, 3.65915586e-01],
[2.63656537e-01, 7.36343463e-01],
[9.99188708e-01, 8.11292315e-04],
[6.73831774e-02, 9.32616823e-01],
[9.25670915e-01, 7.43290848e-02],
[3.70851239e-01, 6.29148761e-01],
[4.40582285e-02, 9.55941772e-01],
[9.64006268e-01, 3.59937322e-02],
[1.95900368e-01, 8.04099632e-01],
[6.79658376e-01, 3.20341624e-01],
[8.04546748e-01, 1.95453252e-01],
[9.27623103e-01, 7.23768966e-02],
[9.10567581e-01, 8.94324192e-02],
[9.09257210e-01, 9.07427904e-02],
[9.11914394e-01, 8.80856060e-02],
[7.70981769e-01, 2.29018231e-01],
[2.48598425e-01, 7.51401575e-01],
[9.18947456e-01, 8.10525437e-02],
[9.04453244e-01, 9.55467563e-02],
[8.92305350e-01, 1.07694650e-01]])
In [ ]:
# Calculate and plot AUC ROC curve
from sklearn.metrics import roc_curve, auc

# Probability of the positive class (Survived = 1); compute once and reuse
# instead of calling predict_proba repeatedly.
y_test_proba = logreg.predict_proba(X_test)[:, 1]

# fpr - False Positive Rate
# tpr - True Positive Rate
# thresholds - Thresholds for the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)
# auc - Area Under the Curve i.e. this will help us to understand the model performance better
roc_auc = auc(fpr, tpr)

# Explicit figure/axes interface (easier to extend than the pyplot state machine).
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, color='blue', label='ROC curve (area = {:.2f})'.format(roc_auc))
ax.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guessing')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR)')
ax.legend()
ax.grid()
plt.show()
| Metric | Meaning |
|---|---|
| AUC = 0.85 | This indicates that there's an 85% chance the model ranks a randomly chosen survivor higher than a randomly chosen non-survivor. |
| High TPR at Low FPR | The model correctly identifies most survivors early on, while making few false positive errors. |
| General Shape | The ROC curve is bowed toward the top-left, indicating good separation between classes. |
Conclusion¶
| Aspect | Assessment |
|---|---|
| Model Discrimination | Strong – able to distinguish survivors from non-survivors |
| Threshold Tuning Need | Optional – current threshold seems reasonable, but could be optimized for better recall or precision based on use-case |
| Next Step Suggestion | Consider precision-recall curve if the positive class (survived) is more critical to capture |
In [38]:
# Setting the threshold to 0.5
threshold = 0.5
logreg.predict_proba(X_test)[:, 1] > threshold
Out[38]:
array([False, False, False, False, True, False, False, False, True,
False, False, False, True, False, False, True, False, False,
True, True, False, False, False, True, False, True, False,
False, False, False, False, False, True, False, True, False,
False, True, True, True, False, True, False, False, False,
False, True, True, False, False, False, False, False, True,
False, False, False, False, True, True, False, False, True,
True, False, False, False, True, True, False, False, True,
True, False, False, False, False, True, True, True, False,
False, False, False, False, True, False, False, True, True,
False, True, True, False, True, False, True, False, False,
False, True, False, True, False, True, True, True, False,
False, True, False, False, False, False, False, True, False,
False, False, False, False, True, False, False, False, True,
False, False, False, True, False, False, True, False, False,
True, False, True, False, False, False, False, False, False,
True, False, False, False, False, False, False, False, True,
True, True, True, True, False, False, False, True, False,
True, False, True, True, False, True, False, False, False,
False, False, False, False, True, False, False, False])
In [39]:
# Setting the threshold to 0.75
threshold = 0.75
logreg.predict_proba(X_test)[:, 1] > threshold
Out[39]:
array([False, False, False, False, True, False, False, False, False,
False, False, False, False, False, False, True, False, False,
False, False, False, False, False, False, False, True, False,
False, False, False, False, False, True, False, True, False,
False, False, True, False, False, False, False, False, False,
False, True, True, False, False, False, False, False, False,
False, False, False, False, True, True, False, False, False,
False, False, False, False, False, False, False, False, False,
False, False, False, False, False, True, False, False, False,
False, False, False, False, True, False, False, True, False,
False, True, True, False, False, False, False, False, False,
False, False, False, False, False, False, True, False, False,
False, True, False, False, False, False, False, True, False,
False, False, False, False, True, False, False, False, False,
False, False, False, True, False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
True, False, False, False, False, False, False, False, True,
False, True, True, False, False, False, False, False, False,
True, False, False, True, False, True, False, False, False,
False, False, False, False, True, False, False, False])
In [40]:
# Setting the threshold to 0.25
threshold = 0.25
logreg.predict_proba(X_test)[:, 1] > threshold
Out[40]:
array([False, False, False, False, True, True, False, False, True,
False, False, True, True, False, False, True, True, False,
True, True, False, False, True, True, True, True, False,
True, False, False, True, False, True, False, True, False,
False, True, True, True, False, True, True, False, False,
False, True, True, False, False, False, True, False, True,
False, False, False, False, True, True, False, False, True,
True, False, False, True, True, True, False, True, True,
True, False, True, False, False, True, True, True, True,
False, True, False, False, True, False, True, True, True,
True, True, True, False, True, False, True, False, False,
False, True, False, True, True, True, True, True, False,
False, True, False, False, False, False, True, True, False,
True, False, True, False, True, False, False, False, True,
False, False, False, True, False, False, True, True, False,
True, False, True, False, False, False, False, False, False,
True, False, False, False, False, False, False, False, True,
True, True, True, True, False, False, True, True, False,
True, False, True, True, False, True, True, False, False,
False, False, False, False, True, False, False, False])
In [42]:
# Setting the threshold to 0.5
threshold = 0.5
pred_05 = np.where(logreg.predict_proba(X_test)[:, 1] > threshold, 1, 0)
# Setting the threshold to 0.75
threshold = 0.75
pred_075 = np.where(logreg.predict_proba(X_test)[:, 1] > threshold, 1, 0)
# Setting the threshold to 0.25
threshold = 0.25
pred_025 = np.where(logreg.predict_proba(X_test)[:, 1] > threshold, 1, 0)
# Printing predictions for different thresholds
print("Predictions with threshold 0.5:", pred_05)
print("\n\nPredictions with threshold 0.75:", pred_075)
print("\n\nPredictions with threshold 0.25:", pred_025)
Predictions with threshold 0.5: [0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 1 1 0 1 1 0 1 0 1 0 0 0 1 0 1 0 1 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0] Predictions with threshold 0.75: [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0] Predictions with threshold 0.25: [0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 1 1 0 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 1 0 0 1 1 1 0 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 1 0 1 0 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0]
In [51]:
# Lets try the model performance with different thresholds
# NOTE(review): `numbers` below (steps of 0.05 up to 0.5) is printed but never
# used in this cell; the threshold grid actually applied is rebuilt in the
# next cell with steps of 0.1 — consider removing this list.
numbers = [float(x)/20 for x in range(0, 11)]
print("Thresholds:", numbers)
# Frame with the actual label, the predicted survival probability, and the
# hard prediction at the default 0.5 cut-off.
cutoff_df = pd.DataFrame(zip(y_test, logreg.predict_proba(X_test)[:, 1], np.where(logreg.predict_proba(X_test)[:, 1] > 0.5, 1, 0)),
columns=['Actual', 'Predicted_Prob', 'Predicted_Class'])
cutoff_df
Thresholds: [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
Out[51]:
| Actual | Predicted_Prob | Predicted_Class | |
|---|---|---|---|
| 0 | 0 | 0.032092 | 0 |
| 1 | 0 | 0.067509 | 0 |
| 2 | 0 | 0.134126 | 0 |
| 3 | 0 | 0.076023 | 0 |
| 4 | 1 | 0.961564 | 1 |
| ... | ... | ... | ... |
| 174 | 0 | 0.229018 | 0 |
| 175 | 1 | 0.751402 | 1 |
| 176 | 0 | 0.081053 | 0 |
| 177 | 0 | 0.095547 | 0 |
| 178 | 0 | 0.107695 | 0 |
179 rows × 3 columns
In [53]:
# Evaluate hard predictions over a grid of thresholds from 0.0 to 1.0.
numbers = [x / 10 for x in range(0, 11)]
print("Thresholds:", numbers)
# One new 0/1 column per threshold: 1 when the predicted survival
# probability exceeds that threshold, else 0.
for threshold in numbers:
    cutoff_df[threshold] = (cutoff_df.Predicted_Prob > threshold).astype(int)
cutoff_df.head()
Thresholds: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
Out[53]:
| Actual | Predicted_Prob | Predicted_Class | 0.0 | 0.05 | 0.1 | 0.15 | 0.2 | 0.25 | 0.3 | 0.35 | 0.4 | 0.45 | 0.5 | 0.6 | 0.7 | 0.8 | 0.9 | 1.0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.032092 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0.067509 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0.134126 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0.076023 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 1 | 0.961564 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 |
In [54]:
from sklearn.metrics import confusion_matrix
# For each candidate threshold, derive accuracy, sensitivity (recall of the
# positive class) and specificity (recall of the negative class) from the
# 2x2 confusion matrix, laid out as:
#   TN = cm[0, 0], FP = cm[0, 1], FN = cm[1, 0], TP = cm[1, 1]
num = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
rows = []
for threshold in num:
    tn, fp, fn, tp = confusion_matrix(cutoff_df.Actual, cutoff_df[threshold]).ravel()
    total1 = tn + fp + fn + tp
    rows.append({
        'prob': threshold,
        'accuracy': (tp + tn) / total1,
        'sensi': tp / (tp + fn),   # true positive rate
        'speci': tn / (tn + fp),   # true negative rate
    })
# Build the summary frame once (indexed by threshold, like the original)
# instead of growing it row-by-row with .loc inside the loop.
cutoff_df1 = pd.DataFrame(rows, index=num, columns=['prob', 'accuracy', 'sensi', 'speci'])
cutoff_df1
Out[54]:
| prob | accuracy | sensi | speci | |
|---|---|---|---|---|
| 0.0 | 0.0 | 0.385475 | 1.000000 | 0.000000 |
| 0.1 | 0.1 | 0.675978 | 0.855072 | 0.563636 |
| 0.2 | 0.2 | 0.731844 | 0.811594 | 0.681818 |
| 0.3 | 0.3 | 0.787709 | 0.797101 | 0.781818 |
| 0.4 | 0.4 | 0.787709 | 0.724638 | 0.827273 |
| 0.5 | 0.5 | 0.815642 | 0.695652 | 0.890909 |
| 0.6 | 0.6 | 0.798883 | 0.608696 | 0.918182 |
| 0.7 | 0.7 | 0.765363 | 0.449275 | 0.963636 |
| 0.8 | 0.8 | 0.748603 | 0.362319 | 0.990909 |
| 0.9 | 0.9 | 0.720670 | 0.289855 | 0.990909 |
| 1.0 | 1.0 | 0.614525 | 0.000000 | 1.000000 |
In [55]:
# Plot accuracy, sensitivity and specificity as functions of the threshold.
plt.figure(figsize=(12, 6))
for col, label in [('accuracy', 'Accuracy'),
                   ('sensi', 'Sensitivity'),
                   ('speci', 'Specificity')]:
    plt.plot(cutoff_df1['prob'], cutoff_df1[col], marker='o', label=label)
plt.title('Model Performance Metrics vs Threshold')
plt.xlabel('Threshold')
plt.ylabel('Metric Value')
plt.legend()
plt.grid()
plt.show()
In [64]:
# The sensitivity/specificity curves cross near 0.33 in the plot above,
# so adopt that as the operating threshold for the final predictions.
threshold = 0.33
final_predictions = (logreg.predict_proba(X_test)[:, 1] > threshold).astype(int)
final_predictions
Out[64]:
array([0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0])
In [65]:
# Test-set accuracy of the predictions made at the 0.33 threshold.
accuracy_score(y_test, final_predictions)
Out[65]:
0.7988826815642458
In [74]:
# Accuracy across a grid of thresholds from 0.0 to 1.0 in steps of 0.05.
# The original cell repeated the predict / score / print lines once per
# threshold (63 near-identical lines); a single loop over the threshold
# list is behaviourally equivalent and far easier to maintain.
from sklearn.metrics import accuracy_score

threshold = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
             0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
# predict_proba is deterministic for a fitted model — compute it once,
# not once per threshold.
pred_prob = logreg.predict_proba(X_test)[:, 1]
for thr in threshold:
    preds = np.where(pred_prob > thr, 1, 0)
    print(f"Accuracy for threshold {thr:.2f}:", accuracy_score(y_test, preds))
Accuracy for threshold 0.00: 0.3854748603351955 Accuracy for threshold 0.05: 0.4692737430167598 Accuracy for threshold 0.10: 0.6759776536312849 Accuracy for threshold 0.15: 0.7094972067039106 Accuracy for threshold 0.20: 0.7318435754189944 Accuracy for threshold 0.25: 0.776536312849162 Accuracy for threshold 0.30: 0.7877094972067039 Accuracy for threshold 0.35: 0.7988826815642458 Accuracy for threshold 0.40: 0.7877094972067039 Accuracy for threshold 0.45: 0.8044692737430168 Accuracy for threshold 0.50: 0.8156424581005587 Accuracy for threshold 0.55: 0.8044692737430168 Accuracy for threshold 0.60: 0.7988826815642458 Accuracy for threshold 0.65: 0.7821229050279329 Accuracy for threshold 0.70: 0.7653631284916201 Accuracy for threshold 0.75: 0.7597765363128491 Accuracy for threshold 0.80: 0.7486033519553073 Accuracy for threshold 0.85: 0.7262569832402235 Accuracy for threshold 0.90: 0.7206703910614525 Accuracy for threshold 0.95: 0.664804469273743 Accuracy for threshold 1.00: 0.6145251396648045
In [75]:
# Fine-grained sweep around 0.5 (0.45-0.55 in steps of 0.01), again as a
# single loop instead of one copy-pasted block per threshold.
threshold = [0.45, 0.46, 0.47, 0.48, 0.49, 0.50, 0.51, 0.52, 0.53, 0.54, 0.55]
pred_prob = logreg.predict_proba(X_test)[:, 1]
for thr in threshold:
    preds = np.where(pred_prob > thr, 1, 0)
    print(f"Accuracy for threshold {thr:.2f}:", accuracy_score(y_test, preds))
Accuracy for threshold 0.45: 0.8044692737430168 Accuracy for threshold 0.46: 0.8156424581005587 Accuracy for threshold 0.47: 0.8156424581005587 Accuracy for threshold 0.48: 0.8156424581005587 Accuracy for threshold 0.49: 0.8156424581005587 Accuracy for threshold 0.50: 0.8156424581005587 Accuracy for threshold 0.51: 0.8044692737430168 Accuracy for threshold 0.52: 0.8100558659217877 Accuracy for threshold 0.53: 0.8156424581005587 Accuracy for threshold 0.54: 0.8044692737430168 Accuracy for threshold 0.55: 0.8044692737430168
In [76]:
# The best test accuracy we obtained is 0.8156424581005587, reached at a threshold of 0.5
# (and at a few nearby thresholds such as 0.46-0.49 and 0.53).
# So the model performs well, with 81.56% accuracy on the test set at a threshold of 0.5.
In [ ]:
class sklearn.linear_model.LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='deprecated', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)¶
1. penalty (default='l2')¶
What it does: Specifies the regularization technique used to avoid overfitting.
Options:
'l1': Lasso regularization (can zero out coefficients). 'l2': Ridge regularization (shrinks coefficients but doesn't zero them). 'elasticnet': Mix of both (`l1_ratio` controls the balance). 'none': No regularization.
Common Use:
'l2' is most common.
2. dual (default=False)¶
- What it does: Chooses the dual formulation of the optimization problem.
- When to use: Set to `True` only when using the `liblinear` solver and when the number of samples < number of features.
- Common Use: Usually `False`.
3. tol (default=0.0001)¶
- What it does: Tolerance for stopping criteria. Smaller values make the model more precise but slower to train.
- Common Use: Default is fine in most cases.
4. C (default=1.0)¶
- What it does: Inverse of regularization strength. Smaller values mean stronger regularization.
- Tip: It’s the most important tuning parameter.
- Common Range: `0.01` to `100`. Default is `1.0`.
5. fit_intercept (default=True)¶
- What it does: Adds an intercept (bias) term to the model.
- Common Use: Keep it
True, unless data is already centered.
6. intercept_scaling (default=1)¶
- What it does: Only used when
solver='liblinear'andfit_intercept=True. Scales the intercept term. - Common Use: Rarely changed.
7. class_weight (default=None)¶
- What it does: Adjusts weights for classes to handle imbalanced data.
- Common Values:
'balanced'or custom dictionary (e.g.,{0:1, 1:3}). - Common Use: Very useful in fraud, medical, or other imbalanced datasets.
8. random_state (default=None)¶
- What it does: Seed for random number generator. Helps ensure reproducibility.
- Common Use: Set to an integer like
42.
9. solver (default='lbfgs')¶
What it does: Optimization algorithm used to find the best coefficients.
Options:
'liblinear': Good for small datasets and supports 'l1'. 'lbfgs': Fast and works well for most cases. 'saga': Supports 'l1', 'l2', and 'elasticnet'; good for large datasets.
Common Use:
'lbfgs'or'saga'.
10. max_iter (default=100)¶
- What it does: Maximum number of iterations for the solver to converge.
- Tip: Increase if model doesn’t converge (e.g., set to
500). - Common Use: Default is fine unless you get a convergence warning.
11. multi_class (default='deprecated')¶
- What it does: Determines how to handle multiple classes.
- Current Behavior: Automatically chosen based on solver.
- Common Values:
'ovr'(one-vs-rest),'multinomial'(better for multiclass). - Note: Explicitly set
'multinomial'when using'lbfgs'or'saga'with multiclass problems.
12. verbose (default=0)¶
- What it does: Controls how much output is shown during training.
- Common Use: Set to a positive integer if you want to debug.
13. warm_start (default=False)¶
- What it does: Reuses solution from previous fit to speed up next one.
- Common Use: Rarely used unless doing iterative training.
14. n_jobs (default=None)¶
- What it does: Number of CPU cores used. Set to
-1to use all available. - Common Use: Helps speed up training in large datasets.
15. l1_ratio (default=None)¶
- What it does: Only used when
penalty='elasticnet'. It balancesl1andl2regularization. - Common Use: Needs tuning if
elasticnetis selected. Value between 0 and 1.
Most Important and Commonly Tuned Parameters:¶
| Parameter | Why Important | Common Settings |
|---|---|---|
C |
Controls regularization strength | 0.01, 0.1, 1, 10 |
penalty |
Regularization method | 'l2', or 'l1' if sparse |
solver |
Impacts speed and regularization options | 'lbfgs', 'liblinear', 'saga' |
class_weight |
Handles imbalance | None or 'balanced' |
max_iter |
Needed for convergence in some cases | 100, 200, 500 |
In [77]:
# Refit with stronger regularization: C=0.5 (smaller C = stronger penalty,
# since C is the inverse of the regularization strength).
from sklearn.linear_model import LogisticRegression
logreg2 = LogisticRegression(C=0.5)
logreg2.fit(X_train, y_train)
# Predictions from the refitted model on both splits
y_pred_train2, y_pred_test2 = logreg2.predict(X_train), logreg2.predict(X_test)
# Accuracy on each split, to compare against the default-C baseline
train_acc2 = accuracy_score(y_train, y_pred_train2)
test_acc2 = accuracy_score(y_test, y_pred_test2)
print("Training Accuracy with C=0.5:", train_acc2)
print("Testing Accuracy with C=0.5:", test_acc2)
Training Accuracy with C=0.5: 0.8314606741573034 Testing Accuracy with C=0.5: 0.8100558659217877
In [78]:
# Refit with weaker regularization: C=1.5 (larger C = weaker penalty).
from sklearn.linear_model import LogisticRegression
logreg3 = LogisticRegression(C=1.5)
logreg3.fit(X_train, y_train)
# Predictions from the refitted model on both splits
y_pred_train3, y_pred_test3 = logreg3.predict(X_train), logreg3.predict(X_test)
# Accuracy on each split
train_acc3 = accuracy_score(y_train, y_pred_train3)
test_acc3 = accuracy_score(y_test, y_pred_test3)
print("Training Accuracy with C=1.5:", train_acc3)
print("Testing Accuracy with C=1.5:", test_acc3)
Training Accuracy with C=1.5: 0.8328651685393258 Testing Accuracy with C=1.5: 0.8100558659217877
In [79]:
# Try L1 (Lasso) regularization with a strong penalty. 'liblinear' is used
# because it is one of the solvers that supports penalty='l1'.
logreg4 = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')
logreg4.fit(X_train, y_train)
# Predictions from the L1 model on both splits
y_pred_train4, y_pred_test4 = logreg4.predict(X_train), logreg4.predict(X_test)
# Accuracy on each split
train_acc4 = accuracy_score(y_train, y_pred_train4)
test_acc4 = accuracy_score(y_test, y_pred_test4)
print(f"Training Accuracy with C=0.1, penalty='l1': {train_acc4}")
print(f"Testing Accuracy with C=0.1, penalty='l1': {test_acc4}")
Training Accuracy with C=0.1, penalty='l1': 0.8061797752808989 Testing Accuracy with C=0.1, penalty='l1': 0.7821229050279329
In [80]:
# Note:
# What can we do to improve the model performance?
# 1. Feature Engineering: Create new features or modify existing ones to better capture the relationships in the data.
# 2. Hyperparameter Tuning: Experiment with different values for hyperparameters like C, penalty, and solver to find the best combination for your model.
# 3. Change Train-Test Split Ratio: Adjust the ratio of training to testing data to see if it affects model performance.
# 4. Change the random_state in train_test_split to see if it affects model performance.
# 5. Try different scaling techniques like Min-Max Scaling or Robust Scaling.
# 6. Try different Machine Learning algorithms like Decision Trees, Random Forests, or Support Vector Machines to see if they perform better on the dataset.(later)
In [81]:
# Task to do now:
# https://www.kaggle.com/code/neisha/heart-disease-prediction-using-logistic-regression
Multiclass¶
In [82]:
# Logistic regression for multiclass classification using the
# One-vs-Rest (OvR) strategy.
# Dataset: the Iris dataset bundled with the seaborn library.
import seaborn as sns
iris = sns.load_dataset('iris')
iris.head()
Out[82]:
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
In [83]:
# Split the frame into features X (the four measurements) and target y (species).
X = iris.drop(columns='species')
y = iris['species']
In [84]:
# Stratified 80/20 train-test split so every species keeps the same
# proportion in both splits.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)
# Confirm the resulting split sizes
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
X_train shape: (120, 4), y_train shape: (120,) X_test shape: (30, 4), y_test shape: (30,)
In [86]:
# Standardize the four numeric features. Fit the scaler on the training
# split only, then apply the same transform to the test split, to avoid
# leaking test-set statistics into training.
# NOTE: the stray `iris.columns` expression from the original cell was
# removed — its result was discarded (not the cell's last line), so it
# had no effect.
features_to_be_scaled = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[features_to_be_scaled] = sc.fit_transform(X_train[features_to_be_scaled])
X_test[features_to_be_scaled] = sc.transform(X_test[features_to_be_scaled])
In [87]:
# Modeling - apply the Logistic Regression algorithm for multiclass classification
from sklearn.linear_model import LogisticRegression
# multi_class='ovr' selects the One-vs-Rest strategy (one binary classifier
# per class). NOTE: this is optional, not compulsory — sklearn handles
# multiclass targets automatically, and the multi_class parameter is
# deprecated in recent sklearn releases.
logreg_multi = LogisticRegression(multi_class='ovr', random_state=0)
logreg_multi.fit(X_train, y_train)
# Predicting on training and test sets
y_pred_train_multi = logreg_multi.predict(X_train)
y_pred_test_multi = logreg_multi.predict(X_test)
# Evaluating the model performance
from sklearn.metrics import accuracy_score, confusion_matrix
train_acc_multi = accuracy_score(y_train, y_pred_train_multi)
test_acc_multi = accuracy_score(y_test, y_pred_test_multi)
print(f"Training Accuracy for Multiclass: {train_acc_multi}")
print(f"Testing Accuracy for Multiclass: {test_acc_multi}")
confusion_mat_multi = confusion_matrix(y_test, y_pred_test_multi)
# Label the axes with the model's own class ordering: confusion_matrix
# sorts labels, and logreg_multi.classes_ is guaranteed to match that
# order, whereas iris['species'].unique() only matched by coincidence
# of row order in the raw data.
cm_df_multi = pd.DataFrame(confusion_mat_multi,
                           index=logreg_multi.classes_,
                           columns=logreg_multi.classes_)
cm_df_multi
Training Accuracy for Multiclass: 0.925 Testing Accuracy for Multiclass: 0.9
Out[87]:
| setosa | versicolor | virginica | |
|---|---|---|---|
| setosa | 10 | 0 | 0 |
| versicolor | 0 | 8 | 2 |
| virginica | 0 | 1 | 9 |
In [95]:
# Predict the class of the first test row. Selecting with iloc[[0]] keeps a
# one-row DataFrame, so the model sees the feature names it was fitted with
# (.values.reshape(1, -1) also works but drops the column names and triggers
# sklearn's "X does not have valid feature names" warning).
logreg_multi.predict(X_test.iloc[[0]])
Out[95]:
array(['setosa'], dtype=object)
In [96]:
# Per-class probabilities for the first test row (column order follows
# logreg_multi.classes_). iloc[[0]] keeps a one-row DataFrame so the fitted
# feature names are preserved, avoiding the sklearn feature-name warning.
logreg_multi.predict_proba(X_test.iloc[[0]])
Out[96]:
array([[9.27179095e-01, 7.27948768e-02, 2.60285132e-05]])
In [97]:
# Sanity check: the per-class probabilities for a row always sum to 1.
np.sum(logreg_multi.predict_proba(X_test.iloc[0, :].values.reshape(1, -1)), axis=1)
Out[97]:
array([1.])
In [99]:
# One more example: predict the class of test row 10. iloc[[10]] keeps a
# one-row DataFrame, preserving the fitted feature names (avoids the sklearn
# "X does not have valid feature names" warning raised for raw arrays).
logreg_multi.predict(X_test.iloc[[10]])
Out[99]:
array(['virginica'], dtype=object)
In [101]:
# Per-class probabilities for test row 10 (order follows logreg_multi.classes_);
# iloc[[10]] preserves the fitted feature names.
logreg_multi.predict_proba(X_test.iloc[[10]])
Out[101]:
array([[0.00182399, 0.10009919, 0.89807683]])
In [102]:
# Sanity check: the probabilities for row 10 also sum to 1.
np.sum(logreg_multi.predict_proba(X_test.iloc[10, :].values.reshape(1, -1)))
Out[102]:
1.0